library("ggplot2")
library('dplyr')
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.4 v purrr 0.3.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library('geosphere')
library("ggmap")
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
# Reading in the sample CSV of rider data we made
rider_2019_sample <- read.csv('sample.csv', stringsAsFactors = TRUE)
head(rider_2019_sample)
## tripduration starttime stoptime
## 1 564 2019-04-11 07:44:36.0580 2019-04-11 07:54:00.1840
## 2 1158 2019-05-15 18:00:33.3890 2019-05-15 18:19:52.0150
## 3 763 2019-03-25 13:27:50.4260 2019-03-25 13:40:33.7960
## 4 915 2019-06-21 15:52:07.8340 2019-06-21 16:07:23.6810
## 5 1368 2019-06-01 18:42:22.9500 2019-06-01 19:05:11.7510
## 6 267 2019-07-31 18:47:05.5630 2019-07-31 18:51:33.0870
## start.station.id start.station.name start.station.latitude
## 1 3711 E 13 St & Avenue A 40.72967
## 2 3016 Kent Ave & N 7 St 40.72037
## 3 382 University Pl & E 14 St 40.73493
## 4 359 E 47 St & Park Ave 40.75510
## 5 3295 Central Park W & W 96 St 40.79127
## 6 3377 Carroll St & Bond St 40.67861
## start.station.longitude end.station.id end.station.name end.station.latitude
## 1 -73.98068 168 W 18 St & 6 Ave 40.73971
## 2 -73.96165 3016 Kent Ave & N 7 St 40.72037
## 3 -73.99201 459 W 20 St & 11 Ave 40.74674
## 4 -73.97499 483 E 12 St & 3 Ave 40.73223
## 5 -73.96484 3142 1 Ave & E 62 St 40.76123
## 6 -73.99037 3398 Smith St & 9 St 40.67470
## end.station.longitude bikeid usertype birth.year gender
## 1 -73.99456 29807 Subscriber 1994 1
## 2 -73.96165 34411 Subscriber 1974 1
## 3 -74.00776 16078 Subscriber 1961 1
## 4 -73.98890 29904 Subscriber 1964 2
## 5 -73.96094 30247 Customer 1969 0
## 6 -73.99786 20315 Subscriber 1971 1
# Reading in the weather data set
weather_data <- read.csv('NYCWeather2019.csv', stringsAsFactors = TRUE)
head(weather_data)
## STATION NAME DATE AWND PRCP SNOW SNWD TAVG
## 1 USW00094728 NY CITY CENTRAL PARK, NY US 1/1/2019 NA 0.06 0 0 NA
## 2 USW00094728 NY CITY CENTRAL PARK, NY US 1/2/2019 NA 0.00 0 0 NA
## 3 USW00094728 NY CITY CENTRAL PARK, NY US 1/3/2019 NA 0.00 0 0 NA
## 4 USW00094728 NY CITY CENTRAL PARK, NY US 1/4/2019 NA 0.00 0 0 NA
## 5 USW00094728 NY CITY CENTRAL PARK, NY US 1/5/2019 NA 0.50 0 0 NA
## 6 USW00094728 NY CITY CENTRAL PARK, NY US 1/6/2019 NA 0.00 0 0 NA
## TMAX TMIN
## 1 58 39
## 2 40 35
## 3 44 37
## 4 47 35
## 5 47 41
## 6 49 31
# Initial summary of rider data set
str(rider_2019_sample)
## 'data.frame': 100000 obs. of 15 variables:
## $ tripduration : int 564 1158 763 915 1368 267 661 1112 520 512 ...
## $ starttime : Factor w/ 99999 levels "2019-01-01 00:56:30.7720",..: 18803 28405 14066 41002 34169 54789 95279 5247 68397 75686 ...
## $ stoptime : Factor w/ 100000 levels "2019-01-01 01:34:45.0200",..: 18804 28409 14065 41001 34174 54787 95282 5246 68395 75682 ...
## $ start.station.id : Factor w/ 825 levels "116","119","120",..: 621 86 688 538 263 348 749 80 259 545 ...
## $ start.station.name : Factor w/ 894 levels "1 Ave & E 110 St",..: 352 545 760 386 250 234 797 672 440 99 ...
## $ start.station.latitude : num 40.7 40.7 40.7 40.8 40.8 ...
## $ start.station.longitude: num -74 -74 -74 -74 -74 ...
## $ end.station.id : Factor w/ 828 levels "116","119","120",..: 15 86 752 774 184 369 623 27 333 509 ...
## $ end.station.name : Factor w/ 890 levels "1 Ave & E 110 St",..: 793 549 795 350 7 714 787 371 598 92 ...
## $ end.station.latitude : num 40.7 40.7 40.7 40.7 40.8 ...
## $ end.station.longitude : num -74 -74 -74 -74 -74 ...
## $ bikeid : int 29807 34411 16078 29904 30247 20315 40128 33989 29972 20897 ...
## $ usertype : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 1 2 1 2 2 2 ...
## $ birth.year : int 1994 1974 1961 1964 1969 1971 1969 1960 1972 1966 ...
## $ gender : int 1 1 1 2 0 1 0 1 1 1 ...
summary(rider_2019_sample)
## tripduration starttime
## Min. : 61.0 2019-11-22 17:59:58.4760: 2
## 1st Qu.: 362.0 2019-01-01 00:56:30.7720: 1
## Median : 614.0 2019-01-01 01:35:30.5010: 1
## Mean : 950.8 2019-01-01 02:04:41.7180: 1
## 3rd Qu.: 1075.0 2019-01-01 02:25:28.9700: 1
## Max. :2769536.0 2019-01-01 02:33:50.6550: 1
## (Other) :99993
## stoptime start.station.id
## 2019-01-01 01:34:45.0200: 1 519 : 810
## 2019-01-01 01:51:55.8730: 1 3255 : 617
## 2019-01-01 02:13:13.4810: 1 497 : 602
## 2019-01-01 02:29:13.1090: 1 402 : 561
## 2019-01-01 03:04:23.8640: 1 435 : 551
## 2019-01-01 04:09:48.6020: 1 (Other):96523
## (Other) :99994 NA's : 336
## start.station.name start.station.latitude start.station.longitude
## Pershing Square North: 810 Min. :40.66 Min. :-74.03
## 8 Ave & W 31 St : 617 1st Qu.:40.72 1st Qu.:-74.00
## E 17 St & Broadway : 602 Median :40.74 Median :-73.98
## Broadway & E 22 St : 561 Mean :40.74 Mean :-73.98
## W 21 St & 6 Ave : 551 3rd Qu.:40.76 3rd Qu.:-73.97
## Broadway & E 14 St : 548 Max. :40.85 Max. :-73.88
## (Other) :96311
## end.station.id end.station.name end.station.latitude
## 519 : 792 Pershing Square North: 792 Min. :40.66
## 402 : 636 Broadway & E 22 St : 636 1st Qu.:40.72
## 3255 : 632 8 Ave & W 31 St : 632 Median :40.74
## 497 : 623 E 17 St & Broadway : 623 Mean :40.74
## 285 : 547 Broadway & E 14 St : 547 3rd Qu.:40.76
## (Other):96426 W 21 St & 6 Ave : 544 Max. :40.86
## NA's : 344 (Other) :96226
## end.station.longitude bikeid usertype birth.year
## Min. :-74.03 Min. :14529 Customer :14054 Min. :1885
## 1st Qu.:-74.00 1st Qu.:25346 Subscriber:85946 1st Qu.:1970
## Median :-73.99 Median :30918 Median :1983
## Mean :-73.98 Mean :29674 Mean :1980
## 3rd Qu.:-73.97 3rd Qu.:35049 3rd Qu.:1990
## Max. :-73.89 Max. :42046 Max. :2003
##
## gender
## Min. :0.000
## 1st Qu.:1.000
## Median :1.000
## Mean :1.161
## 3rd Qu.:1.000
## Max. :2.000
##
# Initial summart of weather data set
str(weather_data)
## 'data.frame': 365 obs. of 10 variables:
## $ STATION: Factor w/ 1 level "USW00094728": 1 1 1 1 1 1 1 1 1 1 ...
## $ NAME : Factor w/ 1 level "NY CITY CENTRAL PARK, NY US": 1 1 1 1 1 1 1 1 1 1 ...
## $ DATE : Factor w/ 365 levels "1/1/2019","1/10/2019",..: 1 12 23 26 27 28 29 30 31 2 ...
## $ AWND : num NA NA NA NA NA NA NA NA NA NA ...
## $ PRCP : num 0.06 0 0 0 0.5 0 0 0.17 0.06 0 ...
## $ SNOW : num 0 0 0 0 0 0 0 0 0 0 ...
## $ SNWD : num 0 0 0 0 0 0 0 0 0 0 ...
## $ TAVG : logi NA NA NA NA NA NA ...
## $ TMAX : int 58 40 44 47 47 49 34 45 45 34 ...
## $ TMIN : int 39 35 37 35 41 31 25 34 34 28 ...
summary(rider_2019_sample)
## tripduration starttime
## Min. : 61.0 2019-11-22 17:59:58.4760: 2
## 1st Qu.: 362.0 2019-01-01 00:56:30.7720: 1
## Median : 614.0 2019-01-01 01:35:30.5010: 1
## Mean : 950.8 2019-01-01 02:04:41.7180: 1
## 3rd Qu.: 1075.0 2019-01-01 02:25:28.9700: 1
## Max. :2769536.0 2019-01-01 02:33:50.6550: 1
## (Other) :99993
## stoptime start.station.id
## 2019-01-01 01:34:45.0200: 1 519 : 810
## 2019-01-01 01:51:55.8730: 1 3255 : 617
## 2019-01-01 02:13:13.4810: 1 497 : 602
## 2019-01-01 02:29:13.1090: 1 402 : 561
## 2019-01-01 03:04:23.8640: 1 435 : 551
## 2019-01-01 04:09:48.6020: 1 (Other):96523
## (Other) :99994 NA's : 336
## start.station.name start.station.latitude start.station.longitude
## Pershing Square North: 810 Min. :40.66 Min. :-74.03
## 8 Ave & W 31 St : 617 1st Qu.:40.72 1st Qu.:-74.00
## E 17 St & Broadway : 602 Median :40.74 Median :-73.98
## Broadway & E 22 St : 561 Mean :40.74 Mean :-73.98
## W 21 St & 6 Ave : 551 3rd Qu.:40.76 3rd Qu.:-73.97
## Broadway & E 14 St : 548 Max. :40.85 Max. :-73.88
## (Other) :96311
## end.station.id end.station.name end.station.latitude
## 519 : 792 Pershing Square North: 792 Min. :40.66
## 402 : 636 Broadway & E 22 St : 636 1st Qu.:40.72
## 3255 : 632 8 Ave & W 31 St : 632 Median :40.74
## 497 : 623 E 17 St & Broadway : 623 Mean :40.74
## 285 : 547 Broadway & E 14 St : 547 3rd Qu.:40.76
## (Other):96426 W 21 St & 6 Ave : 544 Max. :40.86
## NA's : 344 (Other) :96226
## end.station.longitude bikeid usertype birth.year
## Min. :-74.03 Min. :14529 Customer :14054 Min. :1885
## 1st Qu.:-74.00 1st Qu.:25346 Subscriber:85946 1st Qu.:1970
## Median :-73.99 Median :30918 Median :1983
## Mean :-73.98 Mean :29674 Mean :1980
## 3rd Qu.:-73.97 3rd Qu.:35049 3rd Qu.:1990
## Max. :-73.89 Max. :42046 Max. :2003
##
## gender
## Min. :0.000
## 1st Qu.:1.000
## Median :1.000
## Mean :1.161
## 3rd Qu.:1.000
## Max. :2.000
##
# Creating columns of just month, day, and year
weather_data$DATE <- as.Date(weather_data$DATE, format = "%m/%d/%Y")
weather_data$Month <- format(weather_data$DATE, "%m")
weather_data$Day <- format(weather_data$DATE, "%d")
weather_data$Year <- format(weather_data$DATE, "%Y")
# Creating columns of just month, day, and year
rider_2019_sample$DATE <- as.Date(rider_2019_sample$starttime, format = "%Y-%m-%d")
rider_2019_sample$Month <- format(rider_2019_sample$DATE, "%m")
rider_2019_sample$Day <- format(rider_2019_sample$DATE, "%d")
rider_2019_sample$Year <- format(rider_2019_sample$DATE, "%Y")
rider_2019_sample$age <- 2019 - as.numeric(as.character(rider_2019_sample$birth.year))
rider_2019_sample <- filter(rider_2019_sample, age <= 80)
# Combining data frames to compare data
edited_weather <- select(weather_data,
PRCP,
SNOW,
AWND,
DATE)
edited_rider <- select(rider_2019_sample,
age,
gender,
usertype,
tripduration,
start.station.latitude,
start.station.longitude,
start.station.id,
start.station.name,
end.station.latitude,
end.station.longitude,
end.station.id,
end.station.name,
DATE,
Day,
Month,
Year)
total_data = merge(edited_weather, edited_rider, by.x="DATE", by.y="DATE", all.x=TRUE)
head(total_data)
## DATE PRCP SNOW AWND age gender usertype tripduration
## 1 2019-01-01 0.06 0 NA 52 1 Subscriber 1166
## 2 2019-01-01 0.06 0 NA 33 1 Subscriber 532
## 3 2019-01-01 0.06 0 NA 55 1 Subscriber 263
## 4 2019-01-01 0.06 0 NA 29 1 Subscriber 196
## 5 2019-01-01 0.06 0 NA 28 1 Subscriber 710
## 6 2019-01-01 0.06 0 NA 37 2 Subscriber 312
## start.station.latitude start.station.longitude start.station.id
## 1 40.72037 -73.96165 3016
## 2 40.67583 -73.95617 3569
## 3 40.74517 -73.98683 474
## 4 40.72308 -73.98584 3656
## 5 40.75187 -73.97771 519
## 6 40.71422 -73.98135 502
## start.station.name end.station.latitude end.station.longitude
## 1 Kent Ave & N 7 St 40.72080 -73.95485
## 2 Franklin Ave & St Marks Ave 40.69073 -73.95133
## 3 5 Ave & E 29 St 40.74034 -73.98955
## 4 E 2 St & Avenue A 40.72087 -73.98086
## 5 Pershing Square North 40.73222 -73.98166
## 6 Henry St & Grand St 40.72217 -73.98369
## end.station.id end.station.name Day Month Year
## 1 3101 N 12 St & Bedford Ave 01 01 2019
## 2 3056 Kosciuszko St & Nostrand Ave 01 01 2019
## 3 402 Broadway & E 22 St 01 01 2019
## 4 150 E 2 St & Avenue C 01 01 2019
## 5 504 1 Ave & E 16 St 01 01 2019
## 6 301 E 2 St & Avenue B 01 01 2019
# Reclassifying the genders
# 0=unknown, 1=male, 2=female
total_data$gender <- ifelse(total_data$gender == 0, "Unknown",
ifelse(total_data$gender == 1, "Male", "Female"))
# Seeing the split of genders who rented bikes
total_data %>%
ggplot(aes(x=gender)) +
geom_bar()
# Seeing the split of user type who rented bikes
total_data %>%
ggplot(aes(x=usertype)) +
geom_bar()
# Range of all bike rides
total_data <- filter(total_data, tripduration <= 3600)
duration_range <- range(total_data$tripduration, na.rm=TRUE)
duration_range
## [1] 61 3599
# Average length of a bike ride
duration_mean <- mean(total_data$tripduration, na.rm=TRUE)
duration_mean
## [1] 789.341
# Standard deviation of bike rides
duration_sd <- sd(total_data$tripduration, na.rm=TRUE)
duration_sd
## [1] 587.415
# Range of all bike rides affected by rain
total_data_rain <- filter(total_data, SNOW == 0, PRCP > 0)
duration_range_rain <- range(total_data_rain$tripduration, na.rm=TRUE)
duration_range_rain
## [1] 61 3598
# Average length of a bike ride affected by rain
duration_mean_rain <- mean(total_data_rain$tripduration, na.rm=TRUE)
duration_mean_rain
## [1] 777.5114
# Standard deviation of bike rides affected by rain
duration_sd_rain <- sd(total_data_rain$tripduration, na.rm=TRUE)
duration_sd_rain
## [1] 575.1325
# Range of all bike rides affected by snow
total_data_snow <- filter(total_data, SNOW > 0)
duration_range_snow <- range(total_data_snow$tripduration, na.rm=TRUE)
duration_range_snow
## [1] 62 3548
# Average length of a bike ride affected by snow
duration_mean_snow <- mean(total_data_snow$tripduration, na.rm=TRUE)
duration_mean_snow
## [1] 660.3067
# Standard deviation of bike rides affected by snow
duration_sd_snow <- sd(total_data_snow$tripduration, na.rm=TRUE)
duration_sd_snow
## [1] 525.4768
# Range of all bike rides affected by wind
total_data_wind <- filter(total_data, SNOW == 0, PRCP == 0, AWND > 0)
duration_range_wind <- range(total_data_wind$tripduration, na.rm=TRUE)
duration_range_wind
## [1] 61 3599
# Average length of a bike ride affected by wind
duration_mean_wind <- mean(total_data_wind$tripduration, na.rm=TRUE)
duration_mean_wind
## [1] 816.5905
# Standard deviation of bike rides affected by wind
duration_sd_wind <- sd(total_data_wind$tripduration, na.rm=TRUE)
duration_sd_wind
## [1] 601.8395
# Average rain per month
total_data %>%
filter(SNOW == 0) %>%
summarise(average_rain = tapply(PRCP, Month, mean, na.rm=TRUE))
## average_rain
## 1 0.07898309
## 2 0.06682057
## 3 0.06311462
## 4 0.12180282
## 5 0.13145392
## 6 0.14865237
## 7 0.15799374
## 8 0.10023051
## 9 0.02168239
## 10 0.12288957
## 11 0.03855465
## 12 0.17046460
# Average snow per month
total_data %>%
summarise(avg_snow = tapply(SNOW, Month, mean, na.rm=TRUE))
## avg_snow
## 1 0.03033660
## 2 0.05669672
## 3 0.18865182
## 4 0.00000000
## 5 0.00000000
## 6 0.00000000
## 7 0.00000000
## 8 0.00000000
## 9 0.00000000
## 10 0.00000000
## 11 0.00000000
## 12 0.06747897
# Average wind speed per month
total_data %>%
summarise(average_wind_speed = tapply(AWND, Month, mean, na.rm=TRUE))
## average_wind_speed
## 1 NaN
## 2 NaN
## 3 4.918059
## 4 4.345938
## 5 3.726322
## 6 4.111195
## 7 3.405346
## 8 3.847932
## 9 4.285026
## 10 5.247129
## 11 5.304712
## 12 6.341162
# Trip duration by age of riders and rain amount
plot_data <- total_data %>%
filter(SNOW == 0) %>%
group_by(age) %>%
summarise(mean_PRCP_by_age = mean(PRCP),
mean_duration = mean(tripduration))
## `summarise()` ungrouping output (override with `.groups` argument)
plot_data %>%
ggplot(aes(x = age, y = mean_PRCP_by_age)) +
geom_point(alpha =0.9, shape = 18, colour = "blue", size = plot_data$mean_duration/150) +
geom_smooth(colour = "orange")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Mean Wind by Age of Rider
total_data %>%
group_by(age) %>%
summarise(mean_AWND_by_age = mean(AWND,na.rm = TRUE)) %>%
ggplot(aes(x = age, y = mean_AWND_by_age)) + geom_line() + geom_smooth()
## `summarise()` ungrouping output (override with `.groups` argument)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Average ride time when it's raining
total_data %>%
filter(PRCP > 0, SNOW == 0) %>%
summarise(prcp_duration_mean = mean(tripduration))
## prcp_duration_mean
## 1 777.5114
total_data %>%
filter(PRCP > 0, SNOW == 0) %>%
ggplot(aes(x = tripduration)) +
geom_histogram(aes(y=..density..), colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
total_data %>%
filter(PRCP > 0, SNOW == 0) %>%
ggplot(aes(x = tripduration)) +
geom_density(aes(fill=factor(PRCP)), alpha=0.8)
total_data %>%
filter(PRCP > 0, SNOW == 0) %>%
ggplot(aes(x = tripduration)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Average ride time when it's snowing
total_data %>%
filter(SNOW > 0) %>%
summarise(snow_duration_mean = mean(tripduration))
## snow_duration_mean
## 1 660.3067
total_data %>%
filter(SNOW > 0) %>%
ggplot(aes(x = tripduration)) +
geom_histogram(aes(y=..density..), colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
total_data %>%
filter(SNOW > 0) %>%
ggplot(aes(x = tripduration)) +
geom_density(aes(fill=factor(SNOW)), alpha=0.8)
total_data %>%
filter(SNOW > 0) %>%
ggplot(aes(x = tripduration)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Average ride time when it's windy
total_data %>%
filter(AWND > 0) %>%
summarise(wind_duration_mean = mean(tripduration))
## wind_duration_mean
## 1 803.9685
total_data %>%
filter(AWND > 0) %>%
ggplot(aes(x = tripduration)) +
geom_histogram(aes(y=..density..), colour="black", fill="white") +
geom_density(alpha=.2, fill="#FF6666")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
total_data %>%
filter(AWND > 0) %>%
ggplot(aes(x = tripduration)) +
geom_density(aes(fill=factor(AWND)), alpha=0.8)
total_data %>%
filter(AWND > 0) %>%
ggplot(aes(x = tripduration)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Number of rides over average time sin weather effects
ride_num <- total_data %>%
filter(tripduration > duration_mean) %>%
count()
ride_num[1,1]
## [1] 37200
# Number of rides over average time with rain
rain_num <- total_data %>%
filter(SNOW == 0, PRCP > 0, tripduration > duration_mean) %>%
count()
rain_num[1,1]
## [1] 12452
# Number of rides over average time with snow
snow_num <- total_data %>%
filter(SNOW > 0, tripduration > duration_mean) %>%
count()
snow_num[1,1]
## [1] 503
# Number of rides over average time with wind
wind_num <- total_data %>%
filter(AWND > 0, tripduration > duration_mean) %>%
count()
wind_num[1,1]
## [1] 34200
# Distance between start and end station in Meters
total_data <- mutate(total_data,
distance = distHaversine(cbind(total_data$start.station.longitude,
total_data$start.station.latitude),
cbind(total_data$end.station.longitude,
total_data$end.station.latitude)))
head(total_data)
## DATE PRCP SNOW AWND age gender usertype tripduration
## 1 2019-01-01 0.06 0 NA 52 Male Subscriber 1166
## 2 2019-01-01 0.06 0 NA 33 Male Subscriber 532
## 3 2019-01-01 0.06 0 NA 55 Male Subscriber 263
## 4 2019-01-01 0.06 0 NA 29 Male Subscriber 196
## 5 2019-01-01 0.06 0 NA 28 Male Subscriber 710
## 6 2019-01-01 0.06 0 NA 37 Female Subscriber 312
## start.station.latitude start.station.longitude start.station.id
## 1 40.72037 -73.96165 3016
## 2 40.67583 -73.95617 3569
## 3 40.74517 -73.98683 474
## 4 40.72308 -73.98584 3656
## 5 40.75187 -73.97771 519
## 6 40.71422 -73.98135 502
## start.station.name end.station.latitude end.station.longitude
## 1 Kent Ave & N 7 St 40.72080 -73.95485
## 2 Franklin Ave & St Marks Ave 40.69073 -73.95133
## 3 5 Ave & E 29 St 40.74034 -73.98955
## 4 E 2 St & Avenue A 40.72087 -73.98086
## 5 Pershing Square North 40.73222 -73.98166
## 6 Henry St & Grand St 40.72217 -73.98369
## end.station.id end.station.name Day Month Year distance
## 1 3101 N 12 St & Bedford Ave 01 01 2019 576.0106
## 2 3056 Kosciuszko St & Nostrand Ave 01 01 2019 1707.3540
## 3 402 Broadway & E 22 St 01 01 2019 584.0158
## 4 150 E 2 St & Avenue C 01 01 2019 486.4067
## 5 504 1 Ave & E 16 St 01 01 2019 2213.1388
## 6 301 E 2 St & Avenue B 01 01 2019 907.8033
# Speed of the rider
total_data$speed <- total_data$distance/total_data$tripduration
# Average speed of all riders
all_ride <- total_data %>%
summarise(average_speed = mean(speed))
# Average speed of young riders
young_ride <- total_data %>%
filter(age <= 45) %>%
summarise(young_average = mean(speed))
# Average speed of old riders
old_ride <- total_data %>%
filter(age >= 65) %>%
summarise(old_average = mean(speed))
# Average speed of female riders
fem_ride <- total_data %>%
filter(gender == "Female") %>%
summarise(female_average = mean(speed))
# Average speed of male riders
male_ride <- total_data %>%
filter(gender == "Male") %>%
summarise(male_average = mean(speed))
# Average speed of subscribers
sub_ride <- total_data %>%
filter(usertype == "Customer") %>%
summarise(customer_average = mean(speed))
# Average speed of customers
cust_ride <- total_data %>%
filter(usertype == "Subscriber") %>%
summarise(subscriber_average = mean(speed))
Reduce(merge, list(all_ride,
young_ride,
old_ride,
fem_ride,
male_ride,
sub_ride,
cust_ride))
## average_speed young_average old_average female_average male_average
## 1 2.462556 2.538826 2.19201 2.326377 2.572124
## customer_average subscriber_average
## 1 1.801693 2.565832
# Scatter Plot of speed by age
total_data %>%
ggplot(aes(x = age, y = speed, colour = gender)) +
geom_point(alpha = .4, size = 1.5) +
scale_colour_manual(name = 'Gender',
values = setNames(c('blue','magenta', 'dark green'),
c('Male', 'Female', 'Unknown'))) +
geom_smooth(method='lm', colour = 'black') +
labs(title="Average Speed of Riders by Age", x="Speed", y="Age")
## `geom_smooth()` using formula 'y ~ x'
# Boxplot of speed by gender
total_data %>%
ggplot(aes(x = gender, y = speed, colour = gender)) +
geom_boxplot(outlier.colour = 'red') +
scale_colour_manual(name = 'Gender',
values = setNames(c('blue','magenta', 'dark green'),
c('Male', 'Female', 'Unknown'))) +
labs(title="Speed of Riders by Gender", x="Gender", y="Speed")
# Boxplot of speed by customer type
total_data %>%
ggplot(aes(x = usertype, y = speed, colour = usertype)) +
geom_boxplot(outlier.colour = 'red') +
scale_colour_manual(name = 'User Type',
values = setNames(c('purple', 'orange'),
c('Subscriber', 'Customer'))) +
labs(title="Speed of Riders by Customer Type", x="Customer Type", y="Speed")
top_height <- max(total_data$start.station.latitude) - min(total_data$start.station.latitude)
top_width <- max(total_data$start.station.longitude) - min(total_data$start.station.longitude)
top_borders <- c(bottom = min(total_data$start.station.latitude) - 0.1 * top_height,
top = max(total_data$start.station.latitude) + 0.1 * top_height,
left = min(total_data$start.station.longitude) - 0.2 * top_width,
right = max(total_data$start.station.longitude) + 0.2 * top_width)
start <- get_stamenmap(top_borders, zoom = 12, maptype = "toner-lite")
## Source : http://tile.stamen.com/toner-lite/12/1205/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1541.png
start_map <- ggmap(start, extent = "device", legend = "topright")
start_map + stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
size = 1, bins = 5, data = total_data,
geom = "polygon"
)
# convert dates to weekdays
total_data$day_of_week = weekdays(total_data$DATE)
start_map +
stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
bins = 5, geom = "polygon",
data = total_data) +
scale_fill_gradient(low = "black", high = "red") +
facet_wrap(~ day_of_week)
start_map +
stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
bins = 5, geom = "polygon",
data = total_data) +
scale_fill_gradient(low = "black", high = "red") +
facet_wrap(~ usertype)
## break down by one standard deviation above and below average of trip duration
ggmap(start) +
geom_point(data = total_data, mapping = aes(x = start.station.longitude, y = start.station.latitude,
col = tripduration)) +
scale_color_gradient(low = "yellow", high = "red")
## before noon and after noon
end <- get_stamenmap(top_borders, zoom = 12, maptype = "toner-lite")
end_map <- ggmap(end, extent = "device", legend = "topright")
end_map + stat_density2d(
aes(x = end.station.longitude, y = end.station.latitude, fill = ..level.., alpha = ..level..),
size = 1, bins = 5, data = total_data,
geom = "polygon"
)
end_map +
stat_density2d(
aes(x = end.station.longitude, y = end.station.latitude, fill = ..level.., alpha = ..level..),
bins = 5, geom = "polygon",
data = total_data) +
scale_fill_gradient(low = "black", high = "red") +
facet_wrap(~ usertype)
total_rides = count(total_data)
test = total_data
test$start.station.name = as.character(test$start.station.name)
test$end.station.name = as.character(test$end.station.name)
test <- test[test$start.station.name==test$end.station.name, ]
same_station = count(test)
same_station / total_rides
## n
## 1 0.01959041
Only ~2.2% of rides start and end at the same station.
start_popularity = sort(table(total_data$start.station.name), decreasing=TRUE)
top10 = round(length(unique(total_data$start.station.name, na.rm=TRUE))*0.1)
top_10 = head(start_popularity, top10)
barplot(top_10)
top_starts = as.data.frame(top_10)
top_10rides = sum(top_starts$Freq)
top_10rides / total_rides
## n
## 1 0.3290259
32.9% of bike rides start from the top 10% most used stations.
inflow vs outflow (start rides / end rides) in 2019
count_starts = as.data.frame(table(total_data$start.station.name))
names(count_starts) = c("station", "starts")
count_ends = as.data.frame(table(total_data$end.station.name))
names(count_ends) = c("station", "ends")
station_flow = as.data.frame(merge(count_starts, count_ends, by.x="station", by.y="station", all.x=TRUE))
station_flow$net = station_flow$starts - station_flow$ends
station_flow %>% mutate(station = fct_reorder(station, net)) %>% ggplot(aes(x=station, y=net)) + geom_point(stat = "identity")+ geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 15 rows containing missing values (geom_point).